// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/*
 * Extract case information from Unicode property files.
 * This program reads the UnicodeData-4.0.0.txt file and generates
 * the unicodeutil-lowercase.cpp file that gives a mapping from
 * unicode characters to their lowercase equivalents.
 *
 * Author: Tor Egge
 * Author: Vidar Larsen
 */

#include <vespa/fastlib/io/bufferedfile.h>
#include <cstdio>

unsigned int lowercase[65536];

int pages[256];
int pageoff[256];

char linebuf[1024];

static int hexval(char cp)
{
    if (cp >= '0' && cp <= '9')
        return cp - '0';
    if (cp >= 'a' && cp <= 'f')
        return cp - 'a' + 10;
    if (cp >= 'A' && cp <= 'F')
        return cp - 'A' + 10;
    return -1;
}

void DumpCase(void)
{
    Fast_BufferedFile file;
    unsigned int code;
    int xpage, checkpage, allocpage;

    file.WriteOpen("unicodeutil-lowercase.cpp.NEW");

    allocpage = 0;
    for (xpage = 0; xpage < 256; xpage++) {
        for (checkpage = (xpage == 0) ? 0 : 1; checkpage < xpage; checkpage++) {
            for (code = 0; code < 256; code++)
                if (lowercase[code + 256 * xpage] != lowercase[code + 256 * checkpage])
                    break;
            if (code >= 256)
                break;
        }
        pages[xpage] = checkpage;
        if (checkpage >= xpage) {
            pageoff[xpage] = allocpage * 256;
            allocpage++;
        } else
            pageoff[xpage] = pageoff[checkpage];
    }
    printf("allocpage=%d\n", allocpage);

    file.WriteString("/*\n"
                     " * This file is auto-generated by extcase.cpp\n"
                     " */\n"
                     "\n"
                     "static unsigned short Fast_intCompLowerCase[");
    file.addNum(256 * allocpage, 0, ' ');
    file.WriteString("]={\n");
    for (xpage = 0; xpage < 256; xpage++) {
        if (pages[xpage] == xpage) {
            for (code = 0; code < 256; code++) {
                if ((code & 7) == 0)
                    file.WriteString("  ");
                file.WriteString("0x");
                file.addHexNum(lowercase[code + 256 * xpage], 4, '0');
                file.WriteString("u");
                if (code + 1 < static_cast<unsigned int>(256 * allocpage))
                    file.WriteString(",");
                if ((code & 7) < 7)
                    file.WriteString(" ");
                else
                    file.WriteString("\n");
            }
        }
    }
    file.WriteString("};\n"
                     "\n"
                     "unsigned short *"
                     "Fast_UnicodeUtil::_compLowerCase[256]={\n");

    for (xpage = 0; xpage < 256; xpage++) {
        file.WriteString("  Fast_intCompLowerCase+0x");
        file.addHexNum(pageoff[xpage], 4, '0');
        if (xpage < 255)
            file.WriteString(",");
        else
            file.WriteString(" ");
        file.WriteString(" /* Page 0x");
        file.addHexNum(xpage, 2, '0');
        file.WriteString(" */\n");
    }
    file.WriteString("};\n"
                     "\n"
                     "/* End of auto-generated file */\n");
    file.Close();

    rename("unicodeutil-lowercase.cpp.NEW", "unicodeutil-lowercase.cpp");
}

int
main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    Fast_BufferedFile file;
    size_t len;
    int lowcode, code, i, j;
    bool badline;
    int semcnt;

    file.ReadOpenExisting("UnicodeData-4.0.0.txt");

    while (!file.Eof()) {
        (void) file.ReadLine(linebuf, sizeof(linebuf));
        len = strlen(linebuf);
        if (len > 0 && linebuf[len - 1] == '\n')
            len--;
        if (len > 0 && linebuf[len - 1] == '\r')
            len--;
        linebuf[len] = '\0';

        badline = false;
        if (hexval(linebuf[0]) >= 0) {
            code = 0;
            for (i = 0; i < 4; i++) {
                if (hexval(linebuf[i]) < 0)
                    badline = true;
                code = code * 16 + hexval(linebuf[i]);
            }

            if (linebuf[4] == ';') {
                semcnt = 0;
                i = 4;
                while (linebuf[i] != '\0') {
                    if (linebuf[i] == ';') {
                        semcnt++;
                    }
                    i++;
                    if (semcnt >= 13)
                        break;
                }
                lowcode = 0;
                if (hexval(linebuf[i]) >= 0) {
                    for (j = 0; j < 4; j++) {
                        if (hexval(linebuf[i + j]) < 0)
                            badline = true;
                        lowcode = lowcode * 16 + hexval(linebuf[i + j]);
                    }
                } else
                    badline = true;
                if (!badline) {
                    lowercase[code] = lowcode;
                } else {
                }
	
            }
        }
    }
    file.Close();

    DumpCase();

    exit(0);
    return 0;
}
